{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1、归一化 x' = x-min/max-min      x''=x'*(max-min)+min"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 适用于高精度小规模数据，容易受到异常点的影响"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>milage</th>\n",
       "      <th>Liters</th>\n",
       "      <th>Consumtime</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>40920</td>\n",
       "      <td>8.326976</td>\n",
       "      <td>0.953952</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14488</td>\n",
       "      <td>7.153469</td>\n",
       "      <td>1.673904</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26052</td>\n",
       "      <td>1.441871</td>\n",
       "      <td>0.805124</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>75136</td>\n",
       "      <td>13.147394</td>\n",
       "      <td>0.428964</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>38344</td>\n",
       "      <td>1.669788</td>\n",
       "      <td>0.134296</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   milage     Liters  Consumtime  target\n",
       "0   40920   8.326976    0.953952       3\n",
       "1   14488   7.153469    1.673904       2\n",
       "2   26052   1.441871    0.805124       1\n",
       "3   75136  13.147394    0.428964       1\n",
       "4   38344   1.669788    0.134296       1"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv('./data/datingtest/dating.txt')\n",
    "data.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "partitions = ['milage', 'Liters', 'Consumtime']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>milage</th>\n",
       "      <th>Liters</th>\n",
       "      <th>Consumtime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>40920</td>\n",
       "      <td>8.326976</td>\n",
       "      <td>0.953952</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14488</td>\n",
       "      <td>7.153469</td>\n",
       "      <td>1.673904</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26052</td>\n",
       "      <td>1.441871</td>\n",
       "      <td>0.805124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>75136</td>\n",
       "      <td>13.147394</td>\n",
       "      <td>0.428964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>38344</td>\n",
       "      <td>1.669788</td>\n",
       "      <td>0.134296</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   milage     Liters  Consumtime\n",
       "0   40920   8.326976    0.953952\n",
       "1   14488   7.153469    1.673904\n",
       "2   26052   1.441871    0.805124\n",
       "3   75136  13.147394    0.428964\n",
       "4   38344   1.669788    0.134296"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = data[partitions]\n",
    "data.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "mm = MinMaxScaler(feature_range=(0,1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.44832535 0.39805139 0.56233353]\n",
      " [0.15873259 0.34195467 0.98724416]\n",
      " [0.28542943 0.06892523 0.47449629]\n",
      " ...\n",
      " [0.29115949 0.50910294 0.51079493]\n",
      " [0.52711097 0.43665451 0.4290048 ]\n",
      " [0.47940793 0.3768091  0.78571804]]\n"
     ]
    }
   ],
   "source": [
    "data = mm.fit_transform(data)\n",
    "print(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2、标准化 x' = (x-mean)/σ σ是标准差"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 在已有样本足够多的情况下比较稳定，适合现代嘈杂大数据场景"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>milage</th>\n",
       "      <th>Liters</th>\n",
       "      <th>Consumtime</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>40920</td>\n",
       "      <td>8.326976</td>\n",
       "      <td>0.953952</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14488</td>\n",
       "      <td>7.153469</td>\n",
       "      <td>1.673904</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26052</td>\n",
       "      <td>1.441871</td>\n",
       "      <td>0.805124</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>75136</td>\n",
       "      <td>13.147394</td>\n",
       "      <td>0.428964</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>38344</td>\n",
       "      <td>1.669788</td>\n",
       "      <td>0.134296</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   milage     Liters  Consumtime  target\n",
       "0   40920   8.326976    0.953952       3\n",
       "1   14488   7.153469    1.673904       2\n",
       "2   26052   1.441871    0.805124       1\n",
       "3   75136  13.147394    0.428964       1\n",
       "4   38344   1.669788    0.134296       1"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dating = pd.read_csv('./data/datingtest/dating.txt')\n",
    "dating.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "std = StandardScaler()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>milage</th>\n",
       "      <th>Liters</th>\n",
       "      <th>Consumtime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>40920</td>\n",
       "      <td>8.326976</td>\n",
       "      <td>0.953952</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14488</td>\n",
       "      <td>7.153469</td>\n",
       "      <td>1.673904</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26052</td>\n",
       "      <td>1.441871</td>\n",
       "      <td>0.805124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>75136</td>\n",
       "      <td>13.147394</td>\n",
       "      <td>0.428964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>38344</td>\n",
       "      <td>1.669788</td>\n",
       "      <td>0.134296</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>72993</td>\n",
       "      <td>10.141740</td>\n",
       "      <td>1.032955</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>35948</td>\n",
       "      <td>6.830792</td>\n",
       "      <td>1.213192</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>42666</td>\n",
       "      <td>13.276369</td>\n",
       "      <td>0.543880</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>67497</td>\n",
       "      <td>8.631577</td>\n",
       "      <td>0.749278</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>35483</td>\n",
       "      <td>12.273169</td>\n",
       "      <td>1.508053</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   milage     Liters  Consumtime\n",
       "0   40920   8.326976    0.953952\n",
       "1   14488   7.153469    1.673904\n",
       "2   26052   1.441871    0.805124\n",
       "3   75136  13.147394    0.428964\n",
       "4   38344   1.669788    0.134296\n",
       "5   72993  10.141740    1.032955\n",
       "6   35948   6.830792    1.213192\n",
       "7   42666  13.276369    0.543880\n",
       "8   67497   8.631577    0.749278\n",
       "9   35483  12.273169    1.508053"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dating = dating[partitions]\n",
    "dating.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.33193158  0.41660188  0.24523407]\n",
      " [-0.87247784  0.13992897  1.69385734]\n",
      " [-0.34554872 -1.20667094 -0.05422437]\n",
      " ...\n",
      " [-0.32171752  0.96431572  0.06952649]\n",
      " [ 0.65959911  0.60699509 -0.20931587]\n",
      " [ 0.46120328  0.31183342  1.00680598]]\n"
     ]
    }
   ],
   "source": [
    "dating = std.fit_transform(dating)\n",
    "print(dating)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
